package com.sparrow.constant;

import java.util.regex.Pattern;

/**
 * Commonly used regular expressions, mostly for processing HTML fragments.
 *
 * <p>All patterns are exposed as plain strings; callers compile them
 * themselves (for example together with {@link #OPTION}).
 *
 * @author zhanglizhi
 * @date 2011-12-25 10:57:08 AM
 * @version 1.0
 */
public class REGEX {

	/**
	 * Commonly used combination of match flags: multiline, case-insensitive
	 * and canonical equivalence.
	 */
	public static final int OPTION = Pattern.MULTILINE
			| Pattern.CASE_INSENSITIVE | Pattern.CANON_EQ;

	/**
	 * Disruptive HTML tags, with the exception of {@code br}.
	 *
	 * <p>NOTE(review): as written, {@code ^} is a line-start anchor that
	 * follows a literal {@code <}, so this pattern cannot match any input.
	 * The intended expression is unclear, so the value is left untouched;
	 * confirm with the callers before changing it.
	 */
	public static final String TAG_BREAK_HTML = "<^br*?>";

	/**
	 * Page title. Group 1 is the text between {@code <title>} and
	 * {@code </title>} (may span several lines).
	 */
	public static final String PAGE_TITLE = "<title>([\\s\\S]*?)<\\/title>";

	/**
	 * {@code img} tag whose {@code src} points at an in-site image host of
	 * the form {@code http://img<digits>.zhuaququ.net/...}.
	 *
	 * <p>Group 1 is the quote character, group 2 is the image URL.
	 */
	public static final String TAG_INNER_IMAGE = "<img[\\s\\S]*?src=([\"\'])(http:\\/\\/img[0-9]+\\.zhuaququ\\.net\\/.*?)\\1[\\s\\S]*?\\/?>";

	/**
	 * {@link String#format} template used to render a temporary image tag.
	 * Argument 1 is the image URL, argument 2 is the suffix appended to the
	 * {@code tempImage} element id.
	 */
	public static final String TAG_TEMP_IMAGE_FORMAT = "<img src=\"%1$s\" id=\"tempImage%2$s\"/>";

	/**
	 * {@code img} tag whose {@code src} points at the temporary image host
	 * {@code http://temp.zhuaququ.net/...}.
	 *
	 * <p>Group 1 is the quote character, group 2 is the image URL.
	 */
	public static final String TAG_TEMP_IMAGE = "<img[\\s\\S]*?src=([\"\'])(http:\\/\\/temp\\.zhuaququ\\.net\\/.*?)\\1[\\s\\S]*?\\/?>";

	/**
	 * Any {@code img} tag with a quoted {@code src} attribute, regardless of
	 * host (used when crawling third-party sites).
	 *
	 * <p>Group 1 is the quote character, group 2 is the image URL.
	 */
	public static final String TAG_IMAGE = "<img[\\s\\S]*?src=([\"\'])(.*?)\\1[\\s\\S]*?\\/?>";

	/**
	 * Flash {@code embed} tag, optionally followed by {@code </embed>}.
	 *
	 * <p>Group 1 is the (optional) quote character, group 2 is the
	 * {@code src} value. The {@code src} attribute must be followed by
	 * whitespace for the pattern to match.
	 */
	public static final String TAG_FLASH = "<embed.*?src=([\"']?)(.*?)\\1\\s.*?>(<\\/embed>)?";

	/**
	 * One or more consecutive line-break tags ({@code <br>} or
	 * {@code <br/>}).
	 *
	 * <p>The repetition is applied to the whole tag through a non-capturing
	 * group; a bare trailing {@code +} would only repeat the closing
	 * {@code >}.
	 */
	public static final String TAG_BR = "(?:<br/?>)+";

	/**
	 * One or more consecutive HTML non-breaking-space entities
	 * ({@code &nbsp;}).
	 *
	 * <p>The repetition is applied to the whole entity through a
	 * non-capturing group; a bare trailing {@code +} would only repeat the
	 * {@code ;}.
	 */
	public static final String TAG_HTML_SPACE = "(?:&nbsp;)+";

	/**
	 * Named HTML character entity: {@code &}, ASCII letters, {@code ;}.
	 * Numeric entities such as {@code &#160;} are not matched, and the
	 * degenerate {@code &;} is.
	 */
	public static final String TAG_HTML_ESCAPE = "&[A-Za-z]*?;";

	/**
	 * Attribute-less block-level tags: opening or closing {@code div},
	 * {@code p}, {@code dl}, {@code table} and {@code h1}-{@code h6}, plus
	 * {@code br} and {@code hr} (each optionally self-closed).
	 *
	 * <p>Unlike {@link #TAG_BLOCK_WITH_CONTENT}, {@code ul} is not part of
	 * this pattern.
	 */
	public static final String TAG_BLOCK = "</?div>|</?p>|<br/?>|</?dl>|</?table>|<hr/?>|</?h[1-6]>";

	/**
	 * Attribute-less block element together with its content.
	 *
	 * <p>Group 1 is the tag name ({@code div}, {@code p}, {@code ul},
	 * {@code table}, {@code dl} or {@code h1}-{@code h6}); group 2 is the
	 * content up to the first matching closing tag. The content is matched
	 * with {@code .}, so it does not cross line terminators unless the
	 * pattern is compiled with {@link Pattern#DOTALL}.
	 */
	public static final String TAG_BLOCK_WITH_CONTENT = "<(div|p|ul|table|dl|h[1-6])>(.*?)<\\/\\1>";

	/**
	 * HTML comment ({@code <!-- ... -->}), possibly spanning several lines.
	 */
	public static final String ANNOTATION_HTML = "<!--[\\s\\S]*?-->";
}
